Ensemble Learning
- Voting
- Bagging
- Boosting
- Gradient Boosting
- Stacking
Reference: 선형대수와 통계학으로 배우는 머신러닝 with 파이썬
Voting
A voting classifier combines several different models and predicts either the majority-vote label (hard voting) or the label with the highest averaged probability (soft voting).
import
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
data
raw_iris = datasets.load_iris()
X = raw_iris.data
y = raw_iris.target
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)
# standardize the data
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)
Training
# train the voting classifier
clf1 = LogisticRegression(multi_class='multinomial',
                          random_state=1)
clf2 = svm.SVC(kernel='linear',
               random_state=1)
clf3 = GaussianNB()
clf_voting = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('svm', clf2),
        ('gnb', clf3)
    ],
    voting='hard',
    weights=[1, 1, 1])
clf_voting.fit(X_tn_std, y_tn)
VotingClassifier(estimators=[('lr', LogisticRegression(multi_class='multinomial', random_state=1)), ('svm', SVC(kernel='linear', random_state=1)), ('gnb', GaussianNB())], weights=[1, 1, 1])
- estimators: the three models defined above
- voting: 'hard' (majority-vote label, the default) / 'soft' (label with the highest averaged probability)
- weights: the relative weight given to each model's vote
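For reference, a minimal soft-voting variant (my sketch, not from the book; note that svm.SVC needs probability=True before it can supply class probabilities):

# soft voting: average the predicted class probabilities
clf_voting_soft = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(multi_class='multinomial', random_state=1)),
        ('svm', svm.SVC(kernel='linear', probability=True, random_state=1)),
        ('gnb', GaussianNB())
    ],
    voting='soft',
    weights=[2, 1, 1])  # e.g. give logistic regression twice the weight
clf_voting_soft.fit(X_tn_std, y_tn)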
Prediction
pred_voting = clf_voting.predict(X_te_std)
print(pred_voting)
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
2]
Evaluation
accuracy = accuracy_score(y_te, pred_voting)
print(accuracy)
0.9736842105263158
# check the confusion matrix
conf_matrix = confusion_matrix(y_te, pred_voting)
print(conf_matrix)
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]
class_report = classification_report(y_te, pred_voting)
print(class_report)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.94      0.97        16
           2       0.90      1.00      0.95         9

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38
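To see what the ensemble buys, it can help to score each base model on the same split (a quick sketch, not part of the original notebook; it reuses clf1, clf2, clf3 from the training step):

for name, clf in [('lr', clf1), ('svm', clf2), ('gnb', clf3)]:
    clf.fit(X_tn_std, y_tn)
    print(name, accuracy_score(y_te, clf.predict(X_te_std)))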
Random Forest
- Bagging
  - Combines the results of individual classifiers to improve the final classifier's performance
  - Trains each classifier on a bootstrap* sample
  - *: random sampling with replacement
- Random forest algorithm (a minimal sketch follows this list):
  (1) Randomly draw n data points (with replacement)
  (2) Select p features (without replacement)
  (3) Train a decision tree
  (4) Repeat (1)-(3)
  (5) Set the class label by a majority vote over the individual decision trees
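A minimal sketch of steps (1)-(5) (illustrative only; the function names are made up, and the scikit-learn RandomForestClassifier used below is the practical choice):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def simple_random_forest(X, y, n_trees=10, n_features=4, seed=0):
    rng = np.random.default_rng(seed)
    forest = []
    for _ in range(n_trees):
        rows = rng.integers(0, len(X), size=len(X))                    # (1) bootstrap rows, with replacement
        cols = rng.choice(X.shape[1], size=n_features, replace=False)  # (2) feature subset, without replacement
        tree = DecisionTreeClassifier(random_state=0)
        tree.fit(X[rows][:, cols], y[rows])                            # (3) fit one decision tree
        forest.append((tree, cols))                                    # (4) the loop repeats (1)-(3)
    return forest

def forest_predict(forest, X):
    votes = np.stack([tree.predict(X[:, cols]) for tree, cols in forest])
    # (5) per-sample majority vote across the trees
    return np.apply_along_axis(lambda v: np.bincount(v.astype(int)).argmax(), 0, votes)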
import
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
data
raw_wine = datasets.load_wine()
X = raw_wine.data
y = raw_wine.target
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)
Training
clf_rf = RandomForestClassifier(max_depth=2,
                                random_state=0)
clf_rf.fit(X_tn_std, y_tn)
RandomForestClassifier(max_depth=2, random_state=0)
Prediction
pred_rf = clf_rf.predict(X_te_std)
print(pred_rf)
[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
1 1 2 0 0 1 1 1]
Evaluation
accuracy = accuracy_score(y_te, pred_rf)
print(accuracy)
conf_matrix = confusion_matrix(y_te, pred_rf)
print(conf_matrix)
class_report = classification_report(y_te, pred_rf)
print(class_report)
0.9555555555555556
[[16 0 0]
[ 1 19 1]
[ 0 0 8]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.90      0.95        21
           2       0.89      1.00      0.94         8

    accuracy                           0.96        45
   macro avg       0.94      0.97      0.95        45
weighted avg       0.96      0.96      0.96        45
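Random forests also expose per-feature importances; a quick way to inspect the top wine features (my addition, using the fitted clf_rf and raw_wine from above):

import numpy as np
importances = clf_rf.feature_importances_
top = np.argsort(importances)[::-1][:5]   # indices of the five most important features
print([(raw_wine.feature_names[i], round(importances[i], 3)) for i in top])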
Bagging
Bagging trains copies of a single base learner on bootstrap samples and aggregates their predictions; here the base estimator is a Gaussian naive Bayes model.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
raw_wine = datasets.load_wine()
X = raw_wine.data
y = raw_wine.target
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)
clf_bagging = BaggingClassifier(base_estimator=GaussianNB(),
                                n_estimators=10,
                                random_state=0)
clf_bagging.fit(X_tn_std, y_tn)
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/ensemble/_base.py:166: FutureWarning: `base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.
warnings.warn(
BaggingClassifier(base_estimator=GaussianNB(), random_state=0)
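As the FutureWarning above says, the parameter was renamed; on scikit-learn 1.2+ the equivalent call is:

clf_bagging = BaggingClassifier(estimator=GaussianNB(),  # renamed from base_estimator in 1.2
                                n_estimators=10,
                                random_state=0)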
pred_bagging = clf_bagging.predict(X_te_std)
print(pred_bagging)
[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
1 1 2 0 0 1 1 1]
accuracy = accuracy_score(y_te, pred_bagging)
print(accuracy)
conf_matrix = confusion_matrix(y_te, pred_bagging)
print(conf_matrix)
class_report = classification_report(y_te, pred_bagging)
print(class_report)
0.9555555555555556
[[16 0 0]
[ 1 19 1]
[ 0 0 8]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.90      0.95        21
           2       0.89      1.00      0.94         8

    accuracy                           0.96        45
   macro avg       0.94      0.97      0.95        45
weighted avg       0.96      0.96      0.96        45
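Because each bootstrap sample leaves some rows out, bagging can also estimate accuracy without touching the test set; a sketch using the out-of-bag option (my addition, keeping the section's base_estimator spelling):

clf_oob = BaggingClassifier(base_estimator=GaussianNB(),
                            n_estimators=10,
                            oob_score=True,   # score each model on the rows its bootstrap sample missed
                            random_state=0)
clf_oob.fit(X_tn_std, y_tn)
print(clf_oob.oob_score_)  # out-of-bag accuracy estimate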
Boosting
Initially every data point receives the same weight; as training proceeds, the weights of correctly classified points decrease and the weights of misclassified points increase. The learner built at each stage thus changes the weights of the training set used at the next stage.
- AdaBoost algorithm (discrete)
  (1) Initialize the weight of each training point to \(w_i=\dfrac{1}{n}\)
  (2) Fit the \(j\)-th weak learner \(f_j(x_i)\) to the training data
  (3) Compute the weighted error rate of the weak learner \(f_j(x_i)\) from (2):
  \[e_j=\sum_{i=1}^n w_i I(y_i \neq f_j(x_i))\]
  (4) Compute the weight \(\alpha_j\) applied to the weak learner as a whole:
  \[\alpha_j=\dfrac{1}{2}\log\left(\dfrac{1-e_j}{e_j}\right)\]
  (5) Update each weight as \(w_i \leftarrow w_i e^{-\alpha_j y_i f_j(x_i)}\) for labels \(y_i \in \{-1,+1\}\), normalize the weights to sum to 1, and repeat from (2); a sketch follows.
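A minimal sketch of steps (1)-(5) with decision stumps (illustrative, not the book's code; labels must be in {-1, +1}, and a guard against e = 0 is omitted):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def adaboost_fit(X, y, n_rounds=50):
    n = len(X)
    w = np.full(n, 1 / n)                      # (1) w_i = 1/n
    learners, alphas = [], []
    for _ in range(n_rounds):
        stump = DecisionTreeClassifier(max_depth=1)
        stump.fit(X, y, sample_weight=w)       # (2) fit a weak learner on weighted data
        pred = stump.predict(X)
        e = np.sum(w * (pred != y))            # (3) weighted error rate
        alpha = 0.5 * np.log((1 - e) / e)      # (4) weight of this weak learner
        w = w * np.exp(-alpha * y * pred)      # (5) up-weight mistakes, down-weight hits
        w /= w.sum()                           # normalize to sum to 1
        learners.append(stump)
        alphas.append(alpha)
    return learners, alphas

def adaboost_predict(learners, alphas, X):
    # sign of the alpha-weighted vote of the weak learners
    return np.sign(sum(a * clf.predict(X) for a, clf in zip(learners, alphas)))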
- AdaBoost algorithm (real-valued)
  - (to be revised)
import
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
data
raw_breast_cancer = datasets.load_breast_cancer()
X = raw_breast_cancer.data
y = raw_breast_cancer.target
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)
Training
clf_ada = AdaBoostClassifier(random_state=0)
clf_ada.fit(X_tn_std, y_tn)
AdaBoostClassifier(random_state=0)
Prediction
pred_ada = clf_ada.predict(X_te_std)
print(pred_ada)
[0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1
0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0]
Evaluation
accuracy = accuracy_score(y_te, pred_ada)
print(accuracy)
conf_matrix = confusion_matrix(y_te, pred_ada)
print(conf_matrix)
class_report = classification_report(y_te, pred_ada)
print(class_report)
0.9790209790209791
[[52 1]
[ 2 88]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        53
           1       0.99      0.98      0.98        90

    accuracy                           0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143
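AdaBoostClassifier can also report how accuracy evolves per boosting round via staged_predict (my addition, using the fitted clf_ada):

# test accuracy after every 10th boosting round
for i, stage_pred in enumerate(clf_ada.staged_predict(X_te_std), start=1):
    if i % 10 == 0:
        print(i, accuracy_score(y_te, stage_pred))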
Gradient Boosting
- Improves learning by optimizing a cost function: each stage fits a new weak learner to the negative gradient of the current ensemble's loss (for squared-error loss, simply the residuals); see the sketch below.
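A minimal regression sketch of that idea (illustrative only; the function names are made up):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def gradient_boost_fit(X, y, n_rounds=100, lr=0.1):
    f0 = y.mean()                                # constant initial model F_0
    pred = np.full(len(y), f0)
    trees = []
    for _ in range(n_rounds):
        residual = y - pred                      # negative gradient of 0.5*(y - F)^2
        tree = DecisionTreeRegressor(max_depth=2).fit(X, residual)
        pred = pred + lr * tree.predict(X)       # take a small step (learning rate)
        trees.append(tree)
    return f0, trees

def gradient_boost_predict(f0, trees, X, lr=0.1):
    return f0 + lr * sum(t.predict(X) for t in trees)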
import
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
data
raw_breast_cancer = datasets.load_breast_cancer()
X = raw_breast_cancer.data
y = raw_breast_cancer.target
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)
Training
clf_gbt = GradientBoostingClassifier(max_depth=2,
                                     learning_rate=0.01,
                                     random_state=0)
clf_gbt.fit(X_tn_std, y_tn)
GradientBoostingClassifier(learning_rate=0.01, max_depth=2, random_state=0)
Prediction
pred_gboost = clf_gbt.predict(X_te_std)
print(pred_gboost)
[0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
0 1 0 1 1 1 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 1
0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1
0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0]
Evaluation
accuracy = accuracy_score(y_te, pred_gboost)
print(accuracy)
conf_matrix = confusion_matrix(y_te, pred_gboost)
print(conf_matrix)
class_report = classification_report(y_te, pred_gboost)
print(class_report)
0.965034965034965
[[49 4]
[ 1 89]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95        53
           1       0.96      0.99      0.97        90

    accuracy                           0.97       143
   macro avg       0.97      0.96      0.96       143
weighted avg       0.97      0.97      0.96       143
Stacking
- The base learners are trained first; a meta learner then uses the base learners' predictions as features to make the final prediction.
import
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
data
= datasets.load_breast_cancer()
raw_breast_cancer
= raw_breast_cancer.data
X = raw_breast_cancer.target
y
=train_test_split(X,y,random_state=0)
X_tn, X_te, y_tn, y_te
= StandardScaler()
std_scale
std_scale.fit(X_tn)= std_scale.transform(X_tn)
X_tn_std = std_scale.transform(X_te) X_te_std
Training
clf1 = svm.SVC(kernel='linear', random_state=1)
clf2 = GaussianNB()
clf_stkg = StackingClassifier(
    estimators=[
        ('svm', clf1),
        ('gnb', clf2)
    ],
    final_estimator=LogisticRegression())
clf_stkg.fit(X_tn_std, y_tn)
StackingClassifier(estimators=[('svm', SVC(kernel='linear', random_state=1)), ('gnb', GaussianNB())], final_estimator=LogisticRegression())
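To make the "predictions as features" idea concrete, a hand-rolled sketch (my addition; StackingClassifier does this internally, using cross-validated predictions so the meta learner never sees predictions made on a base learner's own training rows):

import numpy as np
from sklearn.model_selection import cross_val_predict

# out-of-fold predictions from each base learner become the meta-features
meta_svm = cross_val_predict(clf1, X_tn_std, y_tn, cv=5)
meta_gnb = cross_val_predict(clf2, X_tn_std, y_tn, cv=5)
X_meta = np.column_stack([meta_svm, meta_gnb])

# the meta learner trains on the base predictions, not the raw features
clf_meta = LogisticRegression().fit(X_meta, y_tn)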
Prediction
pred_stkg = clf_stkg.predict(X_te_std)
print(pred_stkg)
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 1
0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0]
Evaluation
accuracy = accuracy_score(y_te, pred_stkg)
print(accuracy)
conf_matrix = confusion_matrix(y_te, pred_stkg)
print(conf_matrix)
class_report = classification_report(y_te, pred_stkg)
print(class_report)
0.965034965034965
[[50 3]
[ 2 88]]
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        53
           1       0.97      0.98      0.97        90

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.97      0.96       143
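One design choice worth knowing: by default the meta learner sees only the base predictions, while passthrough=True also hands it the original features (a variant sketch, not from the book):

clf_stkg_pt = StackingClassifier(
    estimators=[
        ('svm', clf1),
        ('gnb', clf2)
    ],
    final_estimator=LogisticRegression(),
    passthrough=True)  # meta learner gets base predictions plus the raw features
clf_stkg_pt.fit(X_tn_std, y_tn)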